| Alumno | Código |
|---|---|
| Valeria Llactayo Peña | 17160046 |
| Joaquin Romualdo Peña | 17150052 |
| Javier Sullcaray Barrientos | 17160200 |
| Jose Victorio Gonzales | 17160051 |
| Daniel Yauri Leiva | 17160208 |
En el presente trabajo se muestra el proceso de manipulación de datos de incendios forestales.
#Comenzaremos subiendo los datos que usaremos
load("prep_eda.RData")
# Librerias
library(tidyverse)
library(VIM)
library(skimr)
library(psych)
library(GGally)
library(visdat)
library(plotly)
library(viridis)
library(hrbrthemes)
# Dataset
fires
## # A tibble: 517 x 13
## X Y month day FFMC DMC DC ISI temp RH wind rain area
## <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 5 Mar 5 86.2 26.2 94.3 5.1 8.2 51 6.7 0 0
## 2 7 4 Oct 2 90.6 35.4 669. 6.7 18 33 0.9 0 0
## 3 7 4 Oct 6 90.6 43.7 687. 6.7 14.6 33 1.3 0 0
## 4 8 6 Mar 5 91.7 33.3 77.5 9 8.3 97 4 0.2 0
## 5 8 6 Mar 7 89.3 51.3 102. 9.6 11.4 99 1.8 0 0
## 6 8 6 Aug 7 92.3 85.3 488 14.7 22.2 29 5.4 0 0
## 7 8 6 Aug 1 92.3 88.9 496. 8.5 24.1 27 3.1 0 0
## 8 8 6 Aug 1 91.5 145. 608. 10.7 8 86 2.2 0 0
## 9 8 6 Sep 2 91 130. 693. 7 13.1 63 5.4 0 0
## 10 7 5 Sep 6 92.5 88 699. 7.1 22.8 40 4 0 0
## # ... with 507 more rows
# Resumenes estadísticos
summary(fires)
## X Y month day FFMC
## Min. :1.000 Min. :2.0 Aug :184 1:74 Min. :18.70
## 1st Qu.:3.000 1st Qu.:4.0 Sep :172 2:64 1st Qu.:90.20
## Median :4.000 Median :4.0 Mar : 54 3:54 Median :91.60
## Mean :4.669 Mean :4.3 Jul : 32 4:61 Mean :90.64
## 3rd Qu.:7.000 3rd Qu.:5.0 Feb : 20 5:85 3rd Qu.:92.90
## Max. :9.000 Max. :9.0 Jun : 17 6:84 Max. :96.20
## (Other): 38 7:95
## DMC DC ISI temp
## Min. : 1.1 Min. : 7.9 Min. : 0.000 Min. : 2.20
## 1st Qu.: 68.6 1st Qu.:437.7 1st Qu.: 6.500 1st Qu.:15.50
## Median :108.3 Median :664.2 Median : 8.400 Median :19.30
## Mean :110.9 Mean :547.9 Mean : 9.022 Mean :18.89
## 3rd Qu.:142.4 3rd Qu.:713.9 3rd Qu.:10.800 3rd Qu.:22.80
## Max. :291.3 Max. :860.6 Max. :56.100 Max. :33.30
##
## RH wind rain area
## Min. : 15.00 Min. :0.400 Min. :0.00000 Min. : 0.00
## 1st Qu.: 33.00 1st Qu.:2.700 1st Qu.:0.00000 1st Qu.: 0.00
## Median : 42.00 Median :4.000 Median :0.00000 Median : 0.52
## Mean : 44.29 Mean :4.018 Mean :0.02166 Mean : 12.85
## 3rd Qu.: 53.00 3rd Qu.:4.900 3rd Qu.:0.00000 3rd Qu.: 6.57
## Max. :100.00 Max. :9.400 Max. :6.40000 Max. :1090.84
##
describe(fires)
## vars n mean sd median trimmed mad min max range skew
## X 1 517 4.67 2.31 4.00 4.67 2.97 1.0 9.00 8.00 0.04
## Y 2 517 4.30 1.23 4.00 4.31 1.48 2.0 9.00 7.00 0.41
## month* 3 517 7.48 2.28 8.00 7.79 1.48 1.0 12.00 11.00 -1.21
## day* 4 517 4.26 2.07 5.00 4.32 2.97 1.0 7.00 6.00 -0.21
## FFMC 5 517 90.64 5.52 91.60 91.45 1.93 18.7 96.20 77.50 -6.54
## DMC 6 517 110.87 64.05 108.30 106.52 51.74 1.1 291.30 290.20 0.54
## DC 7 517 547.94 248.07 664.20 578.69 118.90 7.9 860.60 852.70 -1.09
## ISI 8 517 9.02 4.56 8.40 8.73 3.11 0.0 56.10 56.10 2.52
## temp 9 517 18.89 5.81 19.30 19.09 5.34 2.2 33.30 31.10 -0.33
## RH 10 517 44.29 16.32 42.00 42.71 14.83 15.0 100.00 85.00 0.86
## wind 11 517 4.02 1.79 4.00 3.90 1.93 0.4 9.40 9.00 0.57
## rain 12 517 0.02 0.30 0.00 0.00 0.00 0.0 6.40 6.40 19.70
## area 13 517 12.85 63.66 0.52 3.18 0.77 0.0 1090.84 1090.84 12.77
## kurtosis se
## X -1.18 0.10
## Y 1.38 0.05
## month* 0.61 0.10
## day* -1.29 0.09
## FFMC 66.14 0.24
## DMC 0.18 2.82
## DC -0.27 10.91
## ISI 21.15 0.20
## temp 0.11 0.26
## RH 0.41 0.72
## wind 0.03 0.08
## rain 415.60 0.01
## area 191.50 2.80
Cantidad de Missing Values
##
## Variables sorted by number of missings:
## Variable Count
## X 0
## Y 0
## month 0
## day 0
## FFMC 0
## DMC 0
## DC 0
## ISI 0
## temp 0
## RH 0
## wind 0
## rain 0
## area 0
Distribución espacial de incendios
**** Distribución de cantidad de incendios por mes ****
****
Pieplot de cantidad de incendios por mes
Distribución de variables por boxplot
# Distribución de variables por densidad
library(ggridges)
ggplot(fires_long2, aes(x = value, y = variable)) +
geom_density_ridges() +
facet_wrap(~month, scales = "free")
ggplot(fires_long2, aes(x = value, y = variable, fill = stat(quantile))) +
stat_density_ridges(quantile_lines = FALSE,
calc_ecdf = TRUE,
geom = "density_ridges_gradient") +
scale_fill_brewer(name = "") +
facet_wrap(~month, scales = "free")
# Boxplot áreas por mes
ggplot(fires, aes(x = month, y = area)) +
geom_boxplot(aes(fill = month, alpha = 0.5)) +
geom_jitter(color="red", size=1.5, alpha=0.5, width = 0.1) +
theme_ipsum_tw() +
facet_wrap(~month, scales = "free") +
labs(
title = "Áreas quemadas por mes",
subtitle = "Áreas en hectáreas",
caption = "Elaboración propia"
) +
theme_ipsum_tw(grid="XY", axis="xy") +
theme(legend.position = "none")
# Boxplot áreas != 0.0 ha
fires_ars <-
fires %>%
dplyr::filter(
area >= 1
) %>%
dplyr::select(-rain)
summary(fires_ars)
## X Y month day FFMC
## Min. :1.000 Min. :2.000 Sep :90 1:36 Min. :63.50
## 1st Qu.:3.000 1st Qu.:4.000 Aug :83 2:30 1st Qu.:90.35
## Median :5.000 Median :4.000 Mar :18 3:26 Median :91.70
## Mean :4.819 Mean :4.379 Jul :16 4:30 Mean :90.98
## 3rd Qu.:7.000 3rd Qu.:5.000 Feb :10 5:39 3rd Qu.:92.90
## Max. :9.000 Max. :9.000 Dec : 9 6:39 Max. :96.20
## (Other):17 7:43
## DMC DC ISI temp
## Min. : 3.2 Min. : 15.3 Min. : 0.800 Min. : 2.20
## 1st Qu.: 81.8 1st Qu.:470.6 1st Qu.: 6.800 1st Qu.:15.90
## Median :111.7 Median :665.6 Median : 8.400 Median :19.90
## Mean :114.6 Mean :569.2 Mean : 9.044 Mean :19.09
## 3rd Qu.:141.2 3rd Qu.:724.1 3rd Qu.:10.900 3rd Qu.:23.35
## Max. :291.3 Max. :860.6 Max. :21.300 Max. :33.30
##
## RH wind area
## Min. :15.00 Min. :0.400 Min. : 1.01
## 1st Qu.:33.00 1st Qu.:2.700 1st Qu.: 2.99
## Median :41.00 Median :4.000 Median : 7.19
## Mean :43.72 Mean :4.125 Mean : 27.27
## 3rd Qu.:53.00 3rd Qu.:4.900 3rd Qu.: 18.07
## Max. :96.00 Max. :9.400 Max. :1090.84
##
describe(fires_ars)
## vars n mean sd median trimmed mad min max range skew
## X 1 243 4.82 2.35 5.00 4.84 2.97 1.00 9.00 8.00 -0.03
## Y 2 243 4.38 1.14 4.00 4.37 1.48 2.00 9.00 7.00 0.47
## month* 3 243 7.74 2.19 8.00 8.06 1.48 2.00 12.00 10.00 -1.23
## day* 4 243 4.21 2.07 4.00 4.27 2.97 1.00 7.00 6.00 -0.18
## FFMC 5 243 90.98 3.81 91.70 91.53 1.78 63.50 96.20 32.70 -2.69
## DMC 6 243 114.64 63.56 111.70 110.75 43.88 3.20 291.30 288.10 0.54
## DC 7 243 569.16 236.21 665.60 603.22 118.16 15.30 860.60 845.30 -1.23
## ISI 8 243 9.04 4.13 8.40 8.78 2.97 0.80 21.30 20.50 0.67
## temp 9 243 19.09 6.35 19.90 19.43 5.49 2.20 33.30 31.10 -0.46
## RH 10 243 43.72 15.40 41.00 42.30 14.83 15.00 96.00 81.00 0.78
## wind 11 243 4.12 1.89 4.00 3.96 1.93 0.40 9.40 9.00 0.68
## area 12 243 27.27 90.81 7.19 11.45 7.43 1.01 1090.84 1089.83 8.89
## kurtosis se
## X -1.20 0.15
## Y 1.51 0.07
## month* 1.30 0.14
## day* -1.30 0.13
## FFMC 12.49 0.24
## DMC 0.38 4.08
## DC 0.23 15.15
## ISI 0.28 0.26
## temp 0.03 0.41
## RH 0.13 0.99
## wind 0.17 0.12
## area 91.13 5.83
ggplot(fires_ars, aes(x = month, y = area)) +
geom_boxplot(aes(fill = month, alpha = 0.5)) +
geom_jitter(color="red", size=1.5, alpha=0.5, width = 0.1) +
theme_ipsum_tw() +
facet_wrap(~month, scales = "free") +
labs(
title = "Áreas quemadas significativas por mes",
subtitle = "Áreas mayores a 0 hectáreas",
caption = "Elaboración propia"
) +
theme_ipsum_tw(grid="XY", axis="xy") +
theme(legend.position = "none")
library(ggridges)
ggplot(fires, aes(x = temp, y = month, fill = stat(x))) +
geom_density_ridges_gradient(quantile_lines = TRUE, alpha = 0.75,
quantiles = c(0.05, 0.5, 0.95)) +
scale_fill_viridis_c(name = "T (°C)", option = "C") +
theme_ipsum() +
labs(x = "Temperatura", y = "Mes")
# Data filtrada por el periodo de mayor ocurrencia de incendios
fires_per # Se consideran áreas de 0 ha
## # A tibble: 356 x 13
## X Y month FFMC DMC DC ISI temp RH wind rain area grid
## <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 8 6 8 92.3 85.3 488 14.7 22.2 29 5.4 0 0 8-6
## 2 8 6 8 92.3 88.9 496. 8.5 24.1 27 3.1 0 0 8-6
## 3 8 6 8 91.5 145. 608. 10.7 8 86 2.2 0 0 8-6
## 4 8 6 9 91 130. 693. 7 13.1 63 5.4 0 0 8-6
## 5 7 5 9 92.5 88 699. 7.1 22.8 40 4 0 0 7-5
## 6 7 5 9 92.5 88 699. 7.1 17.8 51 7.2 0 0 7-5
## 7 7 5 9 92.8 73.2 713 22.6 19.3 38 4 0 0 7-5
## 8 6 5 8 63.5 70.8 665. 0.8 17 72 6.7 0 0 6-5
## 9 6 5 9 90.9 126. 686. 7 21.3 42 2.2 0 0 6-5
## 10 6 5 9 92.9 133. 700. 9.2 26.4 21 4.5 0 0 6-5
## # ... with 346 more rows
fires_pa <-
fires_per %>%
dplyr::filter(
area > 0.00
)
ggplot(fires_pa, aes(x = "Aug - Sep", y = area)) +
geom_boxplot() +
geom_jitter(color="red", size=1.5, alpha=0.5) +
theme_ipsum_tw() +
labs(
title = "Áreas para el periodo Aug - Sep",
subtitle = "Áreas mayores a 0 hectáreas",
caption = "Elaboración propia"
) +
theme(legend.position = "none")
fires_pap <- # Sin contar areas > 600 ha
fires_pa %>%
dplyr::filter(
area < 600
)
Data para procesamiento
# Data para procesamiento
fires_pca
## # A tibble: 33 x 7
## FFMC DMC DC ISI temp RH area
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 91 130. 693. 7 18.8 40 213.
## 2 91 276. 825. 7.1 21.9 43 70.8
## 3 91.7 191. 636. 7.8 19.9 50 82.8
## 4 93.5 149. 729. 8.1 27.8 27 95.2
## 5 92.5 121. 674. 8.6 18.2 46 201.
## 6 91.6 108. 764 6.2 18 51 0
## 7 81.6 56.7 666. 1.9 21.9 71 54.3
## 8 94.9 130. 587. 14.1 33.1 25 26.4
## 9 92.2 102. 752. 8.4 24.2 27 6.58
## 10 93.3 141. 714. 13.9 18.6 49 35.9
## # ... with 23 more rows
Matriz de correlación
# Matriz de correlación
ggpairs(fires_area[,-c(1:3, 12)])
cor(fires_area[, -c(1:3, 12)])
## FFMC DMC DC ISI temp RH
## FFMC 1.00000000 0.32055252 -0.17227512 0.76054335 0.38085037 -0.4479377
## DMC 0.32055252 1.00000000 0.39014412 0.10424878 0.21615230 -0.2140209
## DC -0.17227512 0.39014412 1.00000000 -0.21263329 -0.03560824 -0.3408187
## ISI 0.76054335 0.10424878 -0.21263329 1.00000000 0.47877095 -0.4167733
## temp 0.38085037 0.21615230 -0.03560824 0.47877095 1.00000000 -0.6477947
## RH -0.44793766 -0.21402086 -0.34081874 -0.41677330 -0.64779472 1.0000000
## wind 0.03999076 0.04661713 -0.17986357 0.31983991 0.31833445 0.0263021
## area 0.09032888 0.08060299 0.01856502 0.04024941 0.18000670 -0.2748217
## wind area
## FFMC 0.03999076 0.09032888
## DMC 0.04661713 0.08060299
## DC -0.17986357 0.01856502
## ISI 0.31983991 0.04024941
## temp 0.31833445 0.18000670
## RH 0.02630210 -0.27482173
## wind 1.00000000 0.08161989
## area 0.08161989 1.00000000
corrplot::corrplot(cor(fires_area[, -c(1:3, 12)]))
cor_fires <- cor(fires_area[-c(1:3, 12)])
cor.plot(cor_fires, order = "hclust")
#cargando los datos
fires_pca
## # A tibble: 33 x 7
## FFMC DMC DC ISI temp RH area
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 91 130. 693. 7 18.8 40 213.
## 2 91 276. 825. 7.1 21.9 43 70.8
## 3 91.7 191. 636. 7.8 19.9 50 82.8
## 4 93.5 149. 729. 8.1 27.8 27 95.2
## 5 92.5 121. 674. 8.6 18.2 46 201.
## 6 91.6 108. 764 6.2 18 51 0
## 7 81.6 56.7 666. 1.9 21.9 71 54.3
## 8 94.9 130. 587. 14.1 33.1 25 26.4
## 9 92.2 102. 752. 8.4 24.2 27 6.58
## 10 93.3 141. 714. 13.9 18.6 49 35.9
## # ... with 23 more rows
#describiendo los datos
library(corrplot)
library(PerformanceAnalytics)
library(lattice)
library(cptcity)
library(cluster)
describe(fires_pca)
## vars n mean sd median trimmed mad min max range skew
## FFMC 1 33 91.92 2.81 91.70 92.29 1.48 81.6 96.00 14.40 -1.81
## DMC 2 33 143.22 57.08 130.30 138.88 32.47 47.9 276.30 228.40 0.73
## DC 3 33 682.94 119.46 692.60 695.43 53.37 100.7 825.10 724.40 -3.36
## ISI 4 33 9.13 4.15 7.80 8.74 2.08 1.9 20.00 18.10 0.94
## temp 5 33 22.42 4.80 21.90 22.33 5.49 12.9 33.10 20.20 0.17
## RH 6 33 40.61 15.02 36.00 38.93 13.34 21.0 80.00 59.00 0.90
## area 7 33 109.00 221.42 42.87 57.29 59.13 0.0 1090.84 1090.84 3.33
## kurtosis se
## FFMC 4.40 0.49
## DMC -0.05 9.94
## DC 14.32 20.80
## ISI 0.28 0.72
## temp -0.77 0.84
## RH -0.09 2.61
## area 10.82 38.54
#ploteo
#esttandarizando los datos
fires_esc <- scale(fires_pca) %>% as_tibble() %>%
print()
## # A tibble: 33 x 7
## FFMC DMC DC ISI temp RH area
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.329 -0.240 0.0808 -0.514 -0.754 -0.0404 0.469
## 2 -0.329 2.33 1.19 -0.489 -0.107 0.159 -0.173
## 3 -0.0797 0.844 -0.394 -0.321 -0.524 0.625 -0.119
## 4 0.560 0.107 0.382 -0.249 1.12 -0.906 -0.0624
## 5 0.205 -0.388 -0.0715 -0.128 -0.879 0.359 0.415
## 6 -0.115 -0.610 0.679 -0.706 -0.920 0.692 -0.492
## 7 -3.67 -1.52 -0.145 -1.74 -0.107 2.02 -0.247
## 8 1.06 -0.226 -0.802 1.20 2.23 -1.04 -0.373
## 9 0.0980 -0.717 0.574 -0.177 0.372 -0.906 -0.463
## 10 0.489 -0.0354 0.259 1.15 -0.795 0.559 -0.330
## # ... with 23 more rows
#matriz de correlación
correlation <- cor(fires_esc) %>%
print()
## FFMC DMC DC ISI temp RH
## FFMC 1.00000000 0.32055252 -0.17227512 0.76054335 0.38085037 -0.4479377
## DMC 0.32055252 1.00000000 0.39014412 0.10424878 0.21615230 -0.2140209
## DC -0.17227512 0.39014412 1.00000000 -0.21263329 -0.03560824 -0.3408187
## ISI 0.76054335 0.10424878 -0.21263329 1.00000000 0.47877095 -0.4167733
## temp 0.38085037 0.21615230 -0.03560824 0.47877095 1.00000000 -0.6477947
## RH -0.44793766 -0.21402086 -0.34081874 -0.41677330 -0.64779472 1.0000000
## area 0.09032888 0.08060299 0.01856502 0.04024941 0.18000670 -0.2748217
## area
## FFMC 0.09032888
## DMC 0.08060299
## DC 0.01856502
## ISI 0.04024941
## temp 0.18000670
## RH -0.27482173
## area 1.00000000
#obteniendo la matríz de correlación
corrplot(correlation,
method = "number",
type = "full",
diag = TRUE,
tl.col = "black",
bg = "white",
title = "Correlation Matrix",
col = NULL)
#matriz de varianza-covarianza
matrix_cov <- cov(fires_esc) %>% print()
## FFMC DMC DC ISI temp RH
## FFMC 1.00000000 0.32055252 -0.17227512 0.76054335 0.38085037 -0.4479377
## DMC 0.32055252 1.00000000 0.39014412 0.10424878 0.21615230 -0.2140209
## DC -0.17227512 0.39014412 1.00000000 -0.21263329 -0.03560824 -0.3408187
## ISI 0.76054335 0.10424878 -0.21263329 1.00000000 0.47877095 -0.4167733
## temp 0.38085037 0.21615230 -0.03560824 0.47877095 1.00000000 -0.6477947
## RH -0.44793766 -0.21402086 -0.34081874 -0.41677330 -0.64779472 1.0000000
## area 0.09032888 0.08060299 0.01856502 0.04024941 0.18000670 -0.2748217
## area
## FFMC 0.09032888
## DMC 0.08060299
## DC 0.01856502
## ISI 0.04024941
## temp 0.18000670
## RH -0.27482173
## area 1.00000000
#suma de la diagonal o "varianza total"
diag(matrix_cov) %>% sum()
## [1] 7
chart.Correlation(fires_esc, histogram = T, pch = 20)
# levelplot(
# correlation,
# col.regions =
# cpt(
# pal = "cb_div_RdBu_11", n = 100
# )
# )
#calculando eigenvectors y eigenvalues
eigen <- eigen(matrix_cov) %>% print()
## eigen() decomposition
## $values
## [1] 2.7314494 1.5214766 1.0291060 0.7940452 0.5463880 0.2548680 0.1226670
##
## $vectors
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -0.48573787 -0.242090101 0.2635235 0.292683927 -0.2912026 -0.42108920
## [2,] -0.25211932 0.449221855 0.4236410 0.530073593 0.4287243 -0.01328501
## [3,] -0.03714229 0.727612240 0.1487127 -0.207889575 -0.4076672 0.32186496
## [4,] -0.47207059 -0.353132132 0.1746827 0.008223907 -0.2928079 0.65948299
## [5,] -0.46540188 -0.003302866 -0.1879590 -0.376182752 0.6523866 0.19226501
## [6,] 0.47933850 -0.247561872 0.2067998 0.396781359 0.2038618 0.47734491
## [7,] -0.17291637 0.155474898 -0.7876290 0.539558604 -0.1109033 0.13845045
## [,7]
## [1,] 0.53686822
## [2,] -0.30034121
## [3,] 0.36616627
## [4,] -0.31813695
## [5,] 0.37943906
## [6,] 0.48921191
## [7,] 0.05455753
#calculando el pca
library(ade4)
pca <- dudi.pca(fires_esc,
scale = F,
scannf = F,
nf = ncol(fires_esc)
)
summary(pca)
## Class: pca dudi
## Call: dudi.pca(df = fires_esc, scale = F, scannf = F, nf = ncol(fires_esc))
##
## Total inertia: 6.788
##
## Eigenvalues:
## Ax1 Ax2 Ax3 Ax4 Ax5
## 2.6487 1.4754 0.9979 0.7700 0.5298
##
## Projected inertia (%):
## Ax1 Ax2 Ax3 Ax4 Ax5
## 39.021 21.735 14.702 11.344 7.806
##
## Cumulative projected inertia (%):
## Ax1 Ax1:2 Ax1:3 Ax1:4 Ax1:5
## 39.02 60.76 75.46 86.80 94.61
##
## (Only 5 dimensions (out of 7) are shown)
#viendo los autovalores
pca$eig
## [1] 2.6486782 1.4753712 0.9979210 0.7699832 0.5298308 0.2471447 0.1189498
#verificando que la suma de los autovalores sea 7
sum(pca$eig)
## [1] 6.787879
#viendo los autovectores
pca$c1
## CS1 CS2 CS3 CS4 CS5 CS6
## FFMC -0.48573787 -0.242090101 -0.2635235 0.292683927 -0.2912026 -0.42108920
## DMC -0.25211932 0.449221855 -0.4236410 0.530073593 0.4287243 -0.01328501
## DC -0.03714229 0.727612240 -0.1487127 -0.207889575 -0.4076672 0.32186496
## ISI -0.47207059 -0.353132132 -0.1746827 0.008223907 -0.2928079 0.65948299
## temp -0.46540188 -0.003302866 0.1879590 -0.376182752 0.6523866 0.19226501
## RH 0.47933850 -0.247561872 -0.2067998 0.396781359 0.2038618 0.47734491
## area -0.17291637 0.155474898 0.7876290 0.539558604 -0.1109033 0.13845045
## CS7
## FFMC 0.53686822
## DMC -0.30034121
## DC 0.36616627
## ISI -0.31813695
## temp 0.37943906
## RH 0.48921191
## area 0.05455753
#Scree plot
library(factoextra)
fviz_screeplot(pca,
addlabels = TRUE,
barfill = "lightblue1",
barcolor = "lightblue1",
linecolor= "indianred2",
ggtheme = theme_grey())
#correlacion entre las variable soriginales y las componetes
levelplot(
as.matrix(pca$co),
col.regions =
cpt(
pal = "cb_div_RdBu_11", n = 100, rev = F
),
xlab = "Variables",
ylab = "Componentes"
)
#Varianza explicada o contribución de las variables
contrib <- as.matrix(pca$co ^ 2)
#Gráfico de contribución
corrplot(contrib,
method = "circle",
tl.col = "black",
bg = "white",
title = "Contributions",
is.corr = F)
as.tibble(fires_esc)
## # A tibble: 33 x 7
## FFMC DMC DC ISI temp RH area
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.329 -0.240 0.0808 -0.514 -0.754 -0.0404 0.469
## 2 -0.329 2.33 1.19 -0.489 -0.107 0.159 -0.173
## 3 -0.0797 0.844 -0.394 -0.321 -0.524 0.625 -0.119
## 4 0.560 0.107 0.382 -0.249 1.12 -0.906 -0.0624
## 5 0.205 -0.388 -0.0715 -0.128 -0.879 0.359 0.415
## 6 -0.115 -0.610 0.679 -0.706 -0.920 0.692 -0.492
## 7 -3.67 -1.52 -0.145 -1.74 -0.107 2.02 -0.247
## 8 1.06 -0.226 -0.802 1.20 2.23 -1.04 -0.373
## 9 0.0980 -0.717 0.574 -0.177 0.372 -0.906 -0.463
## 10 0.489 -0.0354 0.259 1.15 -0.795 0.559 -0.330
## # ... with 23 more rows
head(pca$li)
## Axis1 Axis2 Axis3 Axis4 Axis5 Axis6
## 1 0.70987271 0.29718048 0.5023348 0.2759903 -0.4417954 -0.2702754
## 2 -0.08511407 2.09971023 -1.1817873 0.8987875 0.7350938 0.1991528
## 3 0.55635525 0.05378052 -0.5432282 0.8847904 0.4382798 -0.1346976
## 4 -1.14160982 0.48903837 0.1430041 -0.6764099 0.3540450 -0.5036471
## 5 0.57078750 -0.25178004 0.2309192 0.5653815 -0.7050283 -0.1287344
## 6 1.36309593 0.25211865 -0.3925650 -0.1487619 -0.7025230 -0.1053718
## Axis7
## 1 -0.1913240
## 2 -0.2574199
## 3 -0.2378134
## 4 0.4671501
## 5 0.1059234
## 6 0.5569388
dim(pca$li)
## [1] 33 7
#df de componentes principales
pca_data <- as_tibble(pca$li) %>%
dplyr::select(sprintf("Axis%1$s", 1:3)) %>%
print()
## # A tibble: 33 x 3
## Axis1 Axis2 Axis3
## <dbl> <dbl> <dbl>
## 1 0.710 0.297 0.502
## 2 -0.0851 2.10 -1.18
## 3 0.556 0.0538 -0.543
## 4 -1.14 0.489 0.143
## 5 0.571 -0.252 0.231
## 6 1.36 0.252 -0.393
## 7 4.06 0.178 1.30
## 8 -2.46 -1.17 0.0673
## 9 -0.332 0.285 0.116
## 10 -0.0849 -0.538 -0.878
## # ... with 23 more rows
library(FactoMineR)
fviz_pca_ind(pca, col.var = "cos2",
geom.ind = "point",
col.ind = "royalblue3",
labelsize = 3,
repel = FALSE,
ggtheme = theme_grey())
fviz_pca_biplot(pca, col.var = "cos2",
geom.var = "arrow",
geom.ind = "point",
repel = FALSE,
addEllipses = TRUE,
col.ind = "tomato2",
ggtheme = theme_grey(),
title = "PCA BIPLOT")
library(rgl)
pc <- plot_ly(pca_data, x = ~Axis1, y = ~Axis2, z = ~Axis3, color = "blue")
print(pc)
#Visualización en 3d
library(pca3d)
prc <- prcomp(fires_esc, scale.=F)
pca3d(prc, components = 1:3,
col = "royalblue1",
bg = "gray31",
axes.color = "sandybrown",
new = TRUE)
## [1] 0.08110631 0.10661512 0.07856704
## Creating new device
rglwidget()
#distancias
pca3d(prc, components = 1:3,
col = "royalblue1",
bg = "black",
axes.color = "sandybrown",
show.shadows = TRUE,
new = TRUE)
## [1] 0.08110631 0.10661512 0.07856704
## Creating new device
rglwidget()
#elipse
pca3d(prc, components = 1:3,
col = "royalblue1",
bg = "white",
axes.color = "sandybrown",
show.ellipses=TRUE,
ellipse.ci=0.9, show.plane=FALSE,
new = TRUE)
## [1] 0.08110631 0.10661512 0.07856704
## Creating new device
rglwidget()
#Matriz de distancias Para este caso se usará la distancia euclideana, por ser la que mejor se acomoda a los nuestros datos.
distancia<- dist(pca_data, method = "euclidean")
#Heatmap Como una visualizacion previa de lo que podria ser el numero de clusters, se desarrolla el heatmap, no sin antes haber convertido a matriz nuestra data de distancia.
d <- as.matrix(distancia)
heatmap(d)
fviz_dist(dist.obj = distancia, lab_size = 10,
gradient = list(low = "#FC4E07", mid = "white", high = "#00AFBB"))
#clusterizando con el metodo ward Para este caso se utilizará una técnica de clusterizacion jerarquica, con la técnica Ward D, el cual trabaja con las sumatorias de la distancia al centroide.
hmodel<- hclust(distancia, method = "ward.D")
#dendograma Se plotea el dendograma para darnos una idea de como se agrupan nuestras observaciones en cada cluster, y la cantidad de estas.
plot(hmodel)
#obteniendo las alturas Para definir el corte del dendograma, necesitamos hallar la altura del corte, para eso extraemos la columna de alturas del hmodel.
names <- "height"
heights <- as.data.frame(hmodel$height) #hay 28 alturas
#ploteando las alturas Ploteamos el grafico de alturas y observaciones y definimos que el punto de quiebre se da a partir de la observacion numero 30, lo que significa que este modelo agrupa la data en 3 clusters.
ggplot(heights, aes(x = index(heights), y = hmodel$height)) +
geom_line(color = "royalblue2", # Color de la linea
lwd = 1, # Ancho de la linea
linetype = 1) +
geom_vline(xintercept=30, color = "orange") +
geom_point(color = "royalblue4")+ labs( x = "Observaciones", y = "Altura",
title ="Visualización de alturas")+
theme(plot.title = element_text(hjust = 0.5))
#Obteniendo el valor de la altura
(hmodel$height)[30]
## [1] 8.821526
#cortando el dendograma
clust<- cutree(hmodel, k = 3)
length(clust)
## [1] 33
#numero de elementos en cada cluster
table(clust)
## clust
## 1 2 3
## 17 9 7
#visualizando los grupo en el dendograma
fviz_dend(hmodel, k = 3, # Cortar en 3 grupos
cex = 0.5, # Tamaño de las etiquetas
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE, # Add rectangle around groups
rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
rect_fill = TRUE)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning in if (color == "cluster") color <- "default": la condición tiene
## longitud > 1 y sólo el primer elemento será usado
#visualizando los grupos
fviz_cluster(list(data = pca_data, cluster = clust),
palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
ellipse.type = "convex", # Concentration ellipse
repel = TRUE, # Avoid label overplotting (slow)
show.clust.cent = TRUE, ggtheme = theme_minimal()
)
library(cluster)
#Uniendo el numero de cluster al data frame
col <- c("#2E9FDF", "#00AFBB", "#E7B800")
data2 <- dplyr::bind_cols(pca_data, cluster = as.numeric(clust))
#visualizando en 3d
library(plotly)
hmodel3d <- plot_ly(data2, x = ~Axis1, y = ~Axis2, z = ~Axis3,
marker = list(color = ~cluster, colorscale = c('#FFE1A1', '#683531'), showscale = FALSE))
hmodel3d
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
library(car)
library(scatterplot3d)
pca_data
## # A tibble: 33 x 3
## Axis1 Axis2 Axis3
## <dbl> <dbl> <dbl>
## 1 0.710 0.297 0.502
## 2 -0.0851 2.10 -1.18
## 3 0.556 0.0538 -0.543
## 4 -1.14 0.489 0.143
## 5 0.571 -0.252 0.231
## 6 1.36 0.252 -0.393
## 7 4.06 0.178 1.30
## 8 -2.46 -1.17 0.0673
## 9 -0.332 0.285 0.116
## 10 -0.0849 -0.538 -0.878
## # ... with 23 more rows
# Indicadores para K óptimo
# Valor de silueta
set.seed(2021)
fviz_nbclust(
pca_data, kmeans, method = "silhouette",
k.max = 15
)
# Valor de suma de cuadrados totales
set.seed(2021)
fviz_nbclust(
pca_data, kmeans, method = "wss",
k.max = 15
)
# NbClust for determining the best number of clusters
library(NbClust)
nb <- NbClust::NbClust(pca_data,
distance = "euclidean",
min.nc = 2, max.nc = 20,
method = "kmeans",
index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 3 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 2 proposed 13 as the best number of clusters
## * 2 proposed 17 as the best number of clusters
## * 2 proposed 19 as the best number of clusters
## * 3 proposed 20 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
fviz_nbclust(nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 3 proposed 2 as the best number of clusters
## * 6 proposed 3 as the best number of clusters
## * 2 proposed 5 as the best number of clusters
## * 1 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 12 as the best number of clusters
## * 2 proposed 13 as the best number of clusters
## * 2 proposed 17 as the best number of clusters
## * 2 proposed 19 as the best number of clusters
## * 3 proposed 20 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 3 .
nb$Best.nc # Valores máximos de cada indicador
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 3.0000 19.0000 13.0000 17.0000 7.0000 3.00 3.0000
## Value_Index 33.6854 64.5483 10.3022 5.6188 43.0845 26026.89 768.9477
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 17.0000 13.0000 12.0000 20.0000 20.0000 2.0000
## Value_Index 19.0599 77.4363 -6.4272 0.2435 0.4192 0.6004 1.3669
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 2.000 2.0000 5.0000 3.0000 6.0000 1 3.0000
## Value_Index -8.053 -0.4301 0.3796 29.5099 0.5765 NA 0.5803
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 19.0000 0 5.0000 0 20.0000
## Value_Index 0.6362 0 2.1293 0 0.0072
# K-means
set.seed(2021)
c3 = kmeans(pca_data, centers = 3, iter.max = 100,
nstart = 100)
c3$cluster
## [1] 2 2 2 3 2 2 2 3 2 2 2 2 3 3 2 3 3 1 2 2 2 3 3 2 2 3 2 2 3 3 2 2 3
c3$centers
## Axis1 Axis2 Axis3
## 1 1.9041828 -5.3307561 0.0413673
## 2 0.9444667 0.3795228 -0.1653002
## 3 -1.7327931 -0.1883083 0.2720531
# Silueta por cada cluster
silueta <- cluster::silhouette(c3$cluster, dist(pca_data))
fviz_silhouette(silueta)
## cluster size ave.sil.width
## 1 1 1 0.00
## 2 2 20 0.39
## 3 3 12 0.34
#Gráfico de clusters basados en sus centroides
centroides_c3 = data.frame(c3$centers)
centroides_c3$kmeans3 = 1:3
library(reshape)
centroides_km3_2 = reshape::melt(centroides_c3, id=c("kmeans3"))
str(centroides_km3_2)
## 'data.frame': 9 obs. of 3 variables:
## $ kmeans3 : int 1 2 3 1 2 3 1 2 3
## $ variable: Factor w/ 3 levels "Axis1","Axis2",..: 1 1 1 2 2 2 3 3 3
## $ value : num 1.904 0.944 -1.733 -5.331 0.38 ...
library(ggplot2)
ggplot(centroides_km3_2, aes(x=kmeans3, y=value, fill=variable))+
geom_bar(stat="identity", position="dodge")
# Cluster con PCA
cluster::clusplot(pca_data, c3$cluster, color=T, labels=2)
# * grafico en 3D ----
pc = princomp(fires_pca_esc, cor = T, scores = T)
ls(pc)
## [1] "call" "center" "loadings" "n.obs" "scale" "scores" "sdev"
head(pc$scores)
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6
## [1,] -0.99476586 -0.7256432 0.2187196 1.2686308 0.3459760 -0.04017855
## [2,] 0.08133548 -1.8648166 0.1888528 -1.7759907 0.8409043 -0.37866522
## [3,] -0.49523818 0.1177687 0.1668085 -0.7612738 0.8659261 -0.31701276
## [4,] 1.01305336 -0.7758514 0.1538842 0.5043303 -0.6645919 -0.65028613
## [5,] -0.91419220 -0.3724056 0.7927401 1.5892586 0.6641811 0.09232420
## [6,] -1.07059169 0.4039000 -0.4879735 -1.3309747 -0.2047690 1.17963038
## Comp.7 Comp.8
## [1,] -0.07337053 -0.23629428
## [2,] -0.04677559 -0.25668831
## [3,] 0.21439304 -0.21182802
## [4,] 0.15620701 0.47404577
## [5,] -0.35382580 0.02508472
## [6,] 0.75853243 0.71420830
class(head(pc$scores))
## [1] "matrix" "array"
pca_data
## # A tibble: 33 x 3
## Axis1 Axis2 Axis3
## <dbl> <dbl> <dbl>
## 1 0.710 0.297 0.502
## 2 -0.0851 2.10 -1.18
## 3 0.556 0.0538 -0.543
## 4 -1.14 0.489 0.143
## 5 0.571 -0.252 0.231
## 6 1.36 0.252 -0.393
## 7 4.06 0.178 1.30
## 8 -2.46 -1.17 0.0673
## 9 -0.332 0.285 0.116
## 10 -0.0849 -0.538 -0.878
## # ... with 23 more rows
summary(pc)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
## Standard deviation 1.6712591 1.2667889 1.0307082 0.9790796 0.89086371
## Proportion of Variance 0.3491384 0.2005943 0.1327949 0.1198246 0.09920477
## Cumulative Proportion 0.3491384 0.5497326 0.6825275 0.8023522 0.90155693
## Comp.6 Comp.7 Comp.8
## Standard deviation 0.69781481 0.42347941 0.34823017
## Proportion of Variance 0.06086819 0.02241685 0.01515803
## Cumulative Proportion 0.96242512 0.98484197 1.00000000
biplot(pc)
#install.packages('rgl')
library(rgl)
plot3d(pc$scores[, 1:3], col = c4$cluster, size = 10)
rglwidget() # grafico aparece en Viewer
c3_cluster_f = as.factor(c3$cluster) #En factor para graficar con scatter
c3_cluster_f
## [1] 2 2 2 3 2 2 2 3 2 2 2 2 3 3 2 3 3 1 2 2 2 3 3 2 2 3 2 2 3 3 2 2 3
## Levels: 1 2 3
scatter3d(x = pc$scores[,1], y = pc$scores[,2], z = pc$scores[,3],
xlab = "PC1", ylab = "PC2" , zlab = "PC3",
groups = c3_cluster_f, surface=FALSE)
rglwidget() # grafico aparece en Viewer
# Validación
library(clusterSim)
# Indice Davies - Boulding
val <- c3$cluster
index <- index.DB(pca_data, val, centrotypes = "centroids")
index$DB
## [1] 0.8492732
# Indice Dunn
library(clValid)
dunn(distance = NULL, Data = pca_data, clusters = val)
## [1] 0.05618022
# Variables originales
c3$cluster
## [1] 2 2 2 3 2 2 2 3 2 2 2 2 3 3 2 3 3 1 2 2 2 3 3 2 2 3 2 2 3 3 2 2 3
fires_cluster <-
fires_pca %>%
mutate(
clust = c3$cluster
)
fires_cluster
## # A tibble: 33 x 8
## FFMC DMC DC ISI temp RH area clust
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 91 130. 693. 7 18.8 40 213. 2
## 2 91 276. 825. 7.1 21.9 43 70.8 2
## 3 91.7 191. 636. 7.8 19.9 50 82.8 2
## 4 93.5 149. 729. 8.1 27.8 27 95.2 3
## 5 92.5 121. 674. 8.6 18.2 46 201. 2
## 6 91.6 108. 764 6.2 18 51 0 2
## 7 81.6 56.7 666. 1.9 21.9 71 54.3 2
## 8 94.9 130. 587. 14.1 33.1 25 26.4 3
## 9 92.2 102. 752. 8.4 24.2 27 6.58 2
## 10 93.3 141. 714. 13.9 18.6 49 35.9 2
## # ... with 23 more rows
table(fires_cluster$clust)
##
## 1 2 3
## 1 20 12
#Hkmeans -----------------
# Compute hierarchical k-means clustering
library(factoextra)
hkmodel <-hkmeans(pca_data, 3)
hkmodel$centers
## Axis1 Axis2 Axis3
## 1 0.3505248 0.4866362 -0.3133842
## 2 2.4338417 -0.8623223 0.3353830
## 3 -1.8374042 -0.2374768 0.2728954
# Elements returned by hkmeans()
names(hkmodel)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault" "data"
## [11] "hclust"
# Visualize the tree
fviz_dend(hkmodel, cex = 0.6, palette = "jco",
rect = TRUE, rect_border = "jco", rect_fill = TRUE)
# Visualize the hkmeans final clusters
fviz_cluster(hkmodel,
palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
ellipse.type = "convex", # Concentration ellipse
repel = TRUE, # Avoid label overplotting (slow)
show.clust.cent = TRUE,
ggtheme = theme_minimal()
)
length(hkmodel$cluster)
## [1] 33
head(hkmodel$data)
## # A tibble: 6 x 3
## Axis1 Axis2 Axis3
## <dbl> <dbl> <dbl>
## 1 0.710 0.297 0.502
## 2 -0.0851 2.10 -1.18
## 3 0.556 0.0538 -0.543
## 4 -1.14 0.489 0.143
## 5 0.571 -0.252 0.231
## 6 1.36 0.252 -0.393
table(hkmodel$cluster)
##
## 1 2 3
## 16 6 11
library(plotly)
datahk <- dplyr::bind_cols(hkmodel$data, cluster = as.numeric(clust))
hkmodel <- plot_ly(datahk, x = ~Axis1, y = ~Axis2, z = ~Axis3,
marker = list(color = ~cluster, colorscale = c('#FFE1A1', '#683531'), showscale = FALSE))
hkmodel